An Exploration of US public health data¶
This is an exercise in data analysis in which a dataset focused on public health for the US, with resolution at the county level, is explored and a model for predicting life expectancy is trained. This is a personal learning project with no pretense of drawing causal claims or conclusions.
- US Data
- Correlation Matrix
- Plotting of correlated columns
- Obesity Map
- Watershed Analysis
- Hotspot Analysis
- Machine Learning Models
Loading and preparing data for US
Source: County Health Rankings & Roadmap, a project by the University of Wisconsin
The original file was edited in Excel to keep only the relevant columns and to allow for easier editing in Pandas.
Columns are:
- % Adults with Obesity (Percentage of the adult population that reports a body mass index (BMI) greater than or equal to 30 kg/m2 - same definition as in the EU dataset)
- % Physically Inactive
- % Excessive Drinking
- Food Environment Index (Access to healthy foods by considering the distance an individual lives from a grocery store or supermarket, locations for health food purchases in most communities, and the inability to access healthy food because of cost barriers)
- % Fair or Poor Health
- Income Ratio (Ratio of household income at the 80th percentile to income at the 20th percentile; higher means higher inequality)
- Life Expectancy
- Median Household Income
- Gender Pay Gap (Ratio of women's median earnings to men's median earnings; 1 is equal pay)
- % Some College
- Water Violation (Indicator of the presence of health-related drinking water violations)
# Core data libraries
import pandas as pd
import numpy as np
import geopandas as gpd
# Visualization
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
import kaleido
# Statistics & spatial analysis
from scipy import stats
from scipy.stats import pearsonr
from libpysal.weights import Queen
from esda.getisord import G_Local
from esda.moran import Moran_Local
from shapely.geometry import Point
# ML models & metrics
from sklearn.model_selection import (train_test_split, cross_val_score, KFold,GridSearchCV)
from sklearn.metrics import (mean_absolute_error, mean_squared_error, r2_score)
from sklearn.linear_model import (LinearRegression, Ridge, ElasticNet)
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# Settings
# Render matplotlib figures inline in the notebook (IPython magic)
%matplotlib inline
import warnings
import logging
# Embed interactive Plotly figures directly in the notebook output
pio.renderers.default = 'notebook'
# Display floats with five decimal places in pandas output
pd.set_option('display.float_format', lambda x: '%.5f' % x)
# Silence library warnings and LightGBM's verbose logging for cleaner output
warnings.filterwarnings('ignore')
logging.getLogger('lightgbm').setLevel(logging.WARNING)
# Load database
# County Health Rankings main dataset (pre-trimmed in Excel to the relevant columns)
df=pd.read_excel("US Data.xlsx")
# Normalize FIPS codes to 5-digit zero-padded strings (missing values become "00000")
df["FIPS"]=df["FIPS"].fillna(0).astype(int).astype(str).str.zfill(5)
# Load data for upstream population computed from code in https://github.com/luizedu91/watersheds/blob/main/watershed.ipynb
df_watershed=gpd.read_file('counties_with_upstream_data.gpkg', engine= 'pyogrio', use_arrow=True)[['FIPS','POPULATION',"upstream_area","upstream_population"]]
df=df.merge(df_watershed, on=['FIPS'], how='left')
# Load file with county geometry data
counties_geo = gpd.read_file('cb_2023_us_all_500k.gdb', layer= 'cb_2023_us_county_500k', engine= 'pyogrio', use_arrow= True)[['GEOID','ALAND','geometry']].rename(columns={"GEOID":"FIPS"})
# Fill missing geo data for CT (database uses deprecated counties format for the state)
connecticut= gpd.read_file('Connecticut/cb_2019_09_bg_500k.shp', engine= 'pyogrio', use_arrow= True)[['STATEFP','COUNTYFP',"geometry"]]
# Build the 5-digit county FIPS from the 2-digit state + 3-digit county codes
connecticut["FIPS"]=connecticut[['STATEFP','COUNTYFP']].agg(''.join, axis=1)
connecticut.drop(['STATEFP','COUNTYFP'], axis=1,inplace=True)
# Merge the block-group geometries into one geometry per county
connecticut = connecticut.dissolve(by="FIPS").reset_index()
# Concat both geo dataframes
counties_geo= pd.concat([counties_geo, connecticut])
# Merge to final dataframe and clean data
# NOTE(review): FIPS was already filled and zero-padded above, so the next two
# lines are redundant (notnull() is always True after fillna) but harmless.
df = df[df["FIPS"].notnull()]
df["FIPS"] = df["FIPS"].astype(str).str.zfill(5)
df = df.merge(counties_geo, on=['FIPS'], how='left')
df = gpd.GeoDataFrame(df, geometry="geometry")
# Drop rows that found no matching county geometry
df = df[df.geometry.notnull()]
# Simplify geometries (10 km tolerance in a metric UTM CRS) to speed up plotting
df['geometry'] = df.to_crs(df.estimate_utm_crs()).simplify(10000).to_crs(df.crs)
# Convert to GeoJSON format for Plotly
counties_json = df.__geo_interface__
Air Contamination Data
# Add air quality information
# EPA annual AQI summary by county (2024)
air=pd.read_csv('annual_aqi_by_county_2024.csv')
# Share of measured days rated "Unhealthy for Sensitive Groups" or worse
air['Air Quality: % Bad Days'] = (air['Unhealthy for Sensitive Groups Days'] + air['Unhealthy Days'] + air['Very Unhealthy Days'] + air['Hazardous Days'])*100/ air['Days with AQI']
# NOTE(review): 'Days Ozone' appears twice in this sum. The summary stats later
# in the notebook show this measure ranges 100-200, i.e. it behaves as
# "100% + ozone share" rather than a share of distinct pollutant days —
# confirm whether the double count is intentional before reusing this column.
air['Air Quality: % High Pollutant Days'] =(air['Days Ozone']+air['Days PM10']+air['Days CO']+air['Days NO2']+air['Days Ozone']+air['Days PM2.5']) *100 / air['Days with AQI'] # Measure may sum to more than 1.0
# Keep only the two derived columns plus the State/County keys for the merge
air.drop(columns=['Year', 'Days with AQI', 'Good Days', 'Moderate Days', 'Unhealthy for Sensitive Groups Days', 'Unhealthy Days', 'Very Unhealthy Days', 'Hazardous Days', 'Max AQI', '90th Percentile AQI', 'Median AQI', 'Days PM10', 'Days CO', 'Days NO2', 'Days Ozone', 'Days PM2.5'], axis=1, inplace=True)
df=pd.merge(df, air, on=['State', 'County'], how='left')
Water Contamination Data
The resulting column will contain the number of occurrences of high levels of some contaminant
# EPA Safe Drinking Water Act data: one row per violation/enforcement record
water = pd.read_csv('SDWA_VIOLATIONS_ENFORCEMENT.csv', usecols=['PWSID', 'CONTAMINANT_CODE'], dtype = {'PWSID': str, 'CONTAMINANT_CODE': str})
# Reference table translating contaminant codes into human-readable names
water_codes = pd.read_csv('SDWA_REF_CODE_VALUES.csv', usecols=['VALUE_CODE','VALUE_DESCRIPTION'], dtype = {'VALUE_CODE': str, 'VALUE_DESCRIPTION': str})
# Merge datasets to include contaminant names, not only codes
water=pd.merge(water, water_codes, left_on='CONTAMINANT_CODE', right_on='VALUE_CODE', how='left')
# Group by source and type of contamination
water = water.groupby(['PWSID', 'VALUE_DESCRIPTION']).size().reset_index(name='Infraction_Count')
# Merge with another dataset to include ZIP codes for each row
water_merged = pd.merge(water, pd.read_csv('SDWA_PUB_WATER_SYSTEMS.csv', usecols=['PWSID', 'ZIP_CODE']), on='PWSID', how='left')
# Clean ZIP codes
def five_digits(zip_code):
    """Normalize a ZIP code to its first five characters.

    Strings are truncated to five characters (ZIP+4 -> ZIP5); any string of
    five or fewer characters is returned unchanged, as is any non-string
    value (e.g. NaN from the merge).
    """
    return zip_code[:5] if isinstance(zip_code, str) else zip_code
# Truncate ZIP+4 codes to plain 5-digit ZIPs before joining
water_merged['ZIP_CODE']=water_merged['ZIP_CODE'].apply(five_digits)
# Merge according to ZIP codes in order to have FIPS codes
# NOTE(review): 'zip' shadows the builtin of the same name; harmless here but
# worth renaming (e.g. zip_fips) if this cell grows.
zip=pd.read_csv('ZIP-COUNTY-FIPS_2017-06.csv', usecols=['ZIP', 'STCOUNTYFP'], dtype={'ZIP':str, 'STCOUNTYFP': str})
water_merged=pd.merge(water_merged, zip, left_on="ZIP_CODE", right_on='ZIP', how='left')
water_merged.drop(columns=['PWSID','ZIP_CODE','ZIP'], inplace=True)
# Pivot so each row is one county
water_merged_pivot = water_merged.pivot_table(index='STCOUNTYFP', columns='VALUE_DESCRIPTION', values='Infraction_Count', aggfunc='sum', fill_value=0).reset_index()
# Categorize contaminants and clean dataset
# Manual grouping of the SDWA contaminant descriptions into four broad
# categories; each list is summed into a single per-county column below.
categories = {
    # Metals, nitrates/nitrites and other inorganic chemistry
    "Water Contaminants: Inorganic": [
        "Lead", "COPPER, FREE", "Chromium", "Selenium", "Barium", "Cadmium", "Mercury", "Nickel", 'Beryllium, Total', "Antimony, Total", "Thallium, Total", "Fluoride", "Nitrite", "Nitrate-Nitrite", "CYANIDE", "Chlorite", "Bromate", "Chlorine dioxide","Arsenic", "Silver", "Sulfate"
    ],
    # Solvents, pesticides, PCBs and other organic compounds
    "Water Contaminants: Organic": [
        "Trichloroethylene", "Benzene", "1,1-Dichloroethylene", "Carbon tetrachloride", "1,1,1-Trichloroethane", "p-Dichlorobenzene", "1,2-Dichloroethane", "Tetrachloroethylene", "Toluene", "Ethylbenzene", "Xylenes, Total", "Vinyl chloride", "DICHLOROMETHANE", "cis-1,2-Dichloroethylene", "trans-1,2-Dichloroethylene", "Pentachlorophenol", "Di(2-ethylhexyl) phthalate", "HEXACHLOROBENZENE", "Hexachlorocyclopentadiene", "1,2,4-Trichlorobenzene", "Benzo(a)pyrene", "Atrazine", "Simazine", "Toxaphene", "Methoxychlor", "2,4-D", "2,4,5-TP","Carbofuran", "Glyphosate", "Aldicarb", "Aldicarb sulfoxide", "Aldicarb sulfone", "Endrin", "Heptachlor", "Heptachlor epoxide", "Picloram", "Diquat", "Endothall", "OXAMYL", "Dinoseb","1,1,1,2-Tetrachloroethane", "1,1,2,2-Tetrachloroethane", "1,1,2-Trichloroethane", "1,1-Dichloroethane", "1,1-Dichloropropene", "1,2,3-Trichloropropane", "1,2-Dichloropropane", "1,3-DICHLOROPROPENE", "1,3-Dichloropropane", "2,2-Dichloropropane", "Acrylamide", "Aldrin", "Aroclor 1016", "Aroclor 1221", "Aroclor 1232", "Aroclor 1242", "Aroclor 1248", "Aroclor 1254", "Aroclor 1260", "BHC-GAMMA", "Bis(2-ethylhexyl) phthalate", "Bromobenzene", "Bromodichloromethane", "Bromoform", "Bromomethane", "Butachlor", "CHLOROBENZENE", "Carbaryl", "Chlordane", "Chloroethane", "Chloroform", "Chloromethane", "Dalapon", "Di(2-ethylhexyl) adipate", "Dibromochloromethane", "Dibromomethane", "Dicamba", "Dieldrin", "Epichlorohydrin", "METHYL TERT-BUTYL ETHER", "Methomyl", "Metolachlor", "Metribuzin", "Propachlor", "Styrene", "m-Dichlorobenzene", "m-Xylene", "o-Chlorotoluene", "o-Dichlorobenzene", "o-Xylene", "p-Chlorotoluene", "p-Xylene"
    ],
    # Bacteria, viruses and turbidity indicators
    "Water Contaminants: Microbiological": [
        "E. COLI", "Cryptosporidium", "Coliform (Pre-TCR)", "COLIPHAGE", "Turbidity", "ENTEROCOCCI", "Fecal Coliform"
    ],
    # Disinfection byproducts, radionuclides, asbestos and miscellany
    "Water Contaminants: Other": [
        "Chlorine", "Asbestos", "CARBON, TOTAL", "ETHYLENE DIBROMIDE", "1,2-DIBROMO-3-CHLOROPROPANE", "Total Polychlorinated Biphenyls (PCB)", "3-Hydroxycarbofuran", "3-Nitroaniline", "Residual Chlorine", "2,3,7,8-TCDD", "LASSO", "Total Haloacetic Acids (HAA5)", "TTHM", "Bromate", "Chloramine","Gross Alpha, Excl. Radon and U", "Gross Beta Particle Activity", "Tritium", "38-STRONTIUM-90", "53-IODINE-131", "ManMade Beta Particle and Photon Emitter", "Combined Uranium", "Radium-226", "Radium-228", "Combined Radium (-226 and -228)"
    ],
}
# Columns that are administrative/rule descriptions rather than contaminants
to_drop=['Coliform (TCR)', 'Native American Tribe', 'Not applicable', 'SCALE NOT APPLICABLE TO COLLECTION METHOD', 'Unknown Agency Type', 'Unknown Area Type', 'Surface Water Treatment Rule','Consumer Confidence Rule', "Filter Backwash Rule", 'Unknown Enforcement Action Type', 'Public Notice', 'Unknown Tribe','Revised Total Coliform Rule', 'Nitrate', "Lead and Copper Rule", "Stage 1 Disinfectants and Disinfection Byproducts Rule", "Stage 2 Disinfectants and Disinfection Byproducts Rule", "Interim Enhanced Surface Water Treatment Rule", "Long Term 2 Enhanced Surface Water Treatment Rule", "Groundwater Rule"]
# Sum the individual contaminant columns into one total per category
for category, cols in categories.items():
    water_merged_pivot[category] = water_merged_pivot[cols].sum(axis=1)
# Drop the now-aggregated per-contaminant columns plus the non-contaminant codes
# (the original wrapped columns_to_drop in a no-op list comprehension)
columns_to_drop = [column for category in categories.values() for column in category] + to_drop
water_merged_pivot.drop(columns=columns_to_drop, inplace=True)
# Merge to original dataframe
df=pd.merge(df,water_merged_pivot, left_on="FIPS", right_on="STCOUNTYFP", how='left')
df.drop(columns=['STCOUNTYFP'], inplace=True)
# Rename merged columns to the notebook's display naming convention
df.rename(columns={'POPULATION':'Population', 'upstream_area':"Upstream Area", 'upstream_population':"Upstream Population", 'ALAND':"Area"}, inplace=True)
# Saving df to load more easily when exported
#df.to_parquet("Public_Health.parquet")
1. Socioeconomic Relationships¶
1.1 Exploratory Data Analysis¶
# Summary statistics (count/mean/std/min/max) for the key health,
# socioeconomic and environmental columns
df[['% Adults with Obesity', '% Physically Inactive', '% Excessive Drinking','Food Environment Index', '% Fair or Poor Health', 'Income Ratio', 'Life Expectancy', 'Median Household Income', 'Gender Pay Gap','% Some College','Air Quality: % Bad Days', 'Air Quality: % High Pollutant Days', 'Water Contaminants: Inorganic', 'Water Contaminants: Organic','Water Contaminants: Microbiological', 'Water Contaminants: Other']].describe().loc[['count', 'mean', 'std', 'min', 'max']]
| % Adults with Obesity | % Physically Inactive | % Excessive Drinking | Food Environment Index | % Fair or Poor Health | Income Ratio | Life Expectancy | Median Household Income | Gender Pay Gap | % Some College | Air Quality: % Bad Days | Air Quality: % High Pollutant Days | Water Contaminants: Inorganic | Water Contaminants: Organic | Water Contaminants: Microbiological | Water Contaminants: Other | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 3143.00000 | 3143.00000 | 3143.00000 | 3108.00000 | 3143.00000 | 3128.00000 | 3070.00000 | 3142.00000 | 3137.00000 | 3137.00000 | 960.00000 | 960.00000 | 3138.00000 | 3138.00000 | 3138.00000 | 3138.00000 |
| mean | 37.35221 | 26.66258 | 16.86739 | 7.54305 | 17.72603 | 4.55036 | 75.75492 | 63267.72470 | 0.78148 | 58.87165 | 0.99144 | 154.29668 | 518.63448 | 1874.89930 | 55.51179 | 941.20268 |
| std | 4.55862 | 5.23175 | 2.62241 | 1.19756 | 4.54661 | 0.80724 | 3.43361 | 16275.49141 | 0.10311 | 11.85110 | 3.11262 | 36.69193 | 1177.06721 | 4493.47872 | 129.46044 | 2347.42923 |
| min | 17.40000 | 12.00000 | 9.03810 | 0.00000 | 8.40000 | 1.65164 | 55.91896 | 28972.00000 | 0.25873 | 11.76471 | 0.00000 | 100.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 |
| max | 52.50000 | 47.00000 | 26.79776 | 10.00000 | 38.00000 | 10.53234 | 98.90294 | 167605.00000 | 1.72807 | 100.00000 | 40.00000 | 200.00000 | 28674.00000 | 66094.00000 | 1909.00000 | 28344.00000 |
- Apart from the air quality dataset, the data is quite complete, with little difference in Count between columns
- There is significant variation in every measure within the country.
- Life expectancy ranges from 56 (equivalent to Chad or Lesotho) to 99 (higher than the highest national average, Japan - 85)
- College education ranges from 11% to 100%
- Income ranges from USD 29k to USD 167k
- Obesity ranges from 17% to 52%
1.2 Correlation Analysis¶
# Correlation Matrix
# Pairwise Pearson correlations across the key health/social metrics
metric_cols = ['Median Household Income', '% Fair or Poor Health', '% Physically Inactive',
               '% Adults with Obesity', 'Food Environment Index', 'Life Expectancy',
               '% Excessive Drinking', 'Income Ratio', '% Some College',
               'Air Quality: % Bad Days', 'Air Quality: % High Pollutant Days',
               'Water Contaminants: Inorganic', 'Water Contaminants: Organic',
               'Water Contaminants: Microbiological', 'Water Contaminants: Other']
corr = df[metric_cols].corr()
plt.figure(figsize=(10, 8))
# Hide the redundant upper triangle so each pair is shown once
upper_triangle = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, annot=True, cmap='RdYlBu', mask=upper_triangle, cbar=False)
plt.title('Correlation Matrix of Key Health and Social Metrics');
Insights from the correlation matrix
Strong correlations:
Lower income - poor health - physical inactivity - obesity
Food Environment Index - higher income - better health metrics
Shorter life expectancies - poor health at present - inactivity - obesity - lower income - less access to fresh food - less education - excessive drinking
Moderate correlations:
Education - improvement in all metrics
Higher inequality - worse in all metrics
There were no relevant correlations between the gender pay gap or water violations and any public health metric, and therefore they were dropped from the graph for better visualization. This is another indication that water quality has little influence on the measured health outcomes.
This suggests socioeconomic status significantly impacts health outcomes, with physical inactivity being a key mediating factor.
1.3 Plots for significant correlations¶
# Interactive Bubble Plot using Plotly
# Rows missing any plotted dimension are excluded up front
required_cols = ['Median Household Income', '% Fair or Poor Health',
                 '% Adults with Obesity', 'Food Environment Index',
                 'County', '% Physically Inactive']
bubble_df = df.dropna(subset=required_cols)
fig = px.scatter(bubble_df,
                 x='Median Household Income',
                 y='% Fair or Poor Health',
                 size='% Adults with Obesity',
                 color='% Physically Inactive',
                 color_continuous_scale="RdYlBu_r",
                 hover_data=['County', 'State', 'Food Environment Index'],
                 title='Poor Health vs. Income, Color by Inactivity, Size by Obesity')
# Shrink marker scaling so dense regions stay readable
fig.update_traces(marker=dict(sizeref=0.06))
fig.show()
# Side-by-side scatter plots of the strongest pairwise relationships
fig, (ax_income, ax_obesity) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
# Plot 1: food access vs income, colored by self-reported poor health
sns.scatterplot(data=df,
                x='Median Household Income',
                y='Food Environment Index',
                hue='% Fair or Poor Health',
                ax=ax_income,
                palette='RdYlBu_r',
                hue_norm=(df['% Fair or Poor Health'].min(), df['% Fair or Poor Health'].max()))
ax_income.set_title('Income vs Poor Health')
# Plot 2: obesity vs poor health, colored by life expectancy
sns.scatterplot(data=df,
                y='% Adults with Obesity',
                x='% Fair or Poor Health',
                hue='Life Expectancy',
                palette='RdYlBu',
                ax=ax_obesity)
ax_obesity.set_title('Obesity vs Poor Health - Inverted Scales')
# Invert both axes so "better" values sit top-right on this panel too
ax_obesity.invert_yaxis()
ax_obesity.invert_xaxis()
plt.tight_layout()
plt.show()
Lower income counties have lower accessibility to fresh produce, both of which go hand in hand with higher obesity and poorer health.
2. Geographic Relationships¶
2.1 US Maps¶
Median Income
# Median Household Income Map: county-level choropleth over an OSM basemap
income_map_kwargs = dict(
    geojson=counties_json,
    locations=df.index,
    color='Median Household Income',
    color_continuous_scale="Turbo",
    mapbox_style="open-street-map",
    hover_data=['County', 'State'],
    center={"lat": 39.5, "lon": -93.5},
    opacity=0.7,
    labels={'County': "County", 'Median Household Income': 'Median Household Income'},
)
fig = px.choropleth_mapbox(df, **income_map_kwargs)
# Tight margins, continental-US zoom, fixed height
fig.update_layout(
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    mapbox=dict(zoom=3),
    height=400)
Life Expectancy
# Life Expectancy Map: county-level choropleth over an OSM basemap
life_map_kwargs = dict(
    geojson=counties_json,
    locations=df.index,
    color='Life Expectancy',
    color_continuous_scale="Turbo",
    mapbox_style="open-street-map",
    hover_data=['County', 'State'],
    center={"lat": 39.5, "lon": -93.5},
    opacity=0.7,
    labels={'County': "County", 'Life Expectancy': 'Life Expectancy'},
    title='Life Expectancy',
)
fig = px.choropleth_mapbox(df, **life_map_kwargs)
# Tight margins, continental-US zoom, fixed height
fig.update_layout(
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    mapbox=dict(zoom=3),
    height=400)
Obesity
# Obesity map: county-level choropleth over an OSM basemap
obesity_map_kwargs = dict(
    geojson=counties_json,
    locations=df.index,
    color='% Adults with Obesity',
    color_continuous_scale="Turbo",
    mapbox_style="open-street-map",
    hover_data=['County', 'State'],
    center={"lat": 39.5, "lon": -93.5},
    opacity=0.7,
    labels={'County': "County", '% Adults with Obesity': '% Adults with Obesity'},
)
fig = px.choropleth_mapbox(df, **obesity_map_kwargs)
# Tight margins, continental-US zoom, fixed height
fig.update_layout(
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    mapbox=dict(zoom=3),
    height=400)
2.2 Watershed analysis¶
There appears to be some correlation between the lower Mississippi watershed and higher obesity downstream. One hypothesis is that pollution from population, agricultural, and industrial activity accumulates and affects the health of populations downstream disproportionately in comparison to populations upstream.
![Map of the Mississippi River watershed](mississippi-watershed-2.jpg)
# Filter outliers on Upstream Population due to calculation errors to improve readability of the color scale
# Values above the 99th percentile are zeroed rather than dropped, so the
# counties still render on the map
upper_bound = df['Upstream Population'].quantile(0.99)
df.loc[df['Upstream Population'] > upper_bound, 'Upstream Population'] = 0
# Counties with no computed upstream data count as zero upstream population.
# (The original repeated this fillna twice, the second time via chained
# inplace=True, which is deprecated and a no-op on a copy in modern pandas.)
df['Upstream Population'] = df['Upstream Population'].fillna(0)
# Create choropleth map for upstream population
upstream_map_kwargs = dict(
    geojson=counties_json,
    locations=df.index,
    color='Upstream Population',
    color_continuous_scale="Turbo",
    mapbox_style="open-street-map",
    hover_data=['County', 'State'],
    center={"lat": 39.5, "lon": -93.5},
    opacity=0.7,
    labels={'Upstream Population': 'Upstream Population'},
)
fig = px.choropleth_mapbox(df, **upstream_map_kwargs)
# Tight margins, continental-US zoom, fixed height
fig.update_layout(
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    mapbox=dict(zoom=3, style="open-street-map"),
    height=400)
# Correlations between the calculated upstream population and health outcomes
for label, column in [("obesity", "% Adults with Obesity"),
                      ("poor health", "% Fair or Poor Health")]:
    corr = df['Upstream Population'].corr(df[column])
    print(f"Correlation with {label}: {corr:.3f}")
Correlation with obesity: -0.094 Correlation with poor health: 0.002
Irrelevant correlation.
Attempt at correlating only the Mississippi watershed
# Create mask only for the Mississippi watershed
from shapely.geometry import Polygon
# Approximate bounding box for Mississippi watershed.
# NOTE(review): the original corner labels did not match the longitudes
# (-92.5 lies east of -95.5); treat these as rough corners only. Shapely
# closes the ring back to the first coordinate automatically.
coords = [(-92.5, 29.0), # Gulf coast, near the river mouth
(-115, 55), # Northwest corner
(-83, 55), # Northeast corner
(-90, 29.0), # Gulf coast, east
(-95.5, 29.0)] # Gulf coast, west
boundary = Polygon(coords)
# Filter counties that intersect with the bounding box
mississippi_mask = df.intersects(boundary)
mississippi_region = df[mississippi_mask].copy()
# Correlation between the calculated upstream population and obesity / poor health
corr=mississippi_region[['Upstream Population', "% Adults with Obesity"]].corr()['Upstream Population']['% Adults with Obesity']
print(f"Correlation with obesity: {corr:.3f}")
corr=mississippi_region[['Upstream Population', "% Fair or Poor Health"]].corr()['Upstream Population']['% Fair or Poor Health']
print(f"Correlation with poor health: {corr:.3f}")
Correlation with obesity: 0.039 Correlation with poor health: 0.175
# Testing for significance of correlation with poor health
# Keep only rows where both variables are present
paired = mississippi_region[['Upstream Population', '% Fair or Poor Health']].dropna()
correlation, p_value = pearsonr(paired['Upstream Population'],
                                paired['% Fair or Poor Health'])
print(f"Correlation Coefficient: {correlation:.3f}")
print(f"P-value: {p_value:.10f}")
Correlation Coefficient: 0.175 P-value: 0.0000000011
Irrelevant correlation with obesity. Watershed hypothesis for this should be dropped.
However, although 0.175 is a low correlation explaining only about 3% of the variance, given the size of the dataset, it is a highly significant result
2.3 Hotspots Maps¶
Hotspot (Red) Cluster: Areas with high values are surrounded by other areas with high values.
Coldspot (Blue) Cluster: Areas with low values are surrounded by other areas with low values.
Doughnut (Green) Outlier: Areas with high values are surrounded by areas with low values.
Diamond (Yellow) Outlier: Areas with low values are surrounded by areas with high values.
# Function for Local Moran's I Clustering and classification
def calculate_morans_i(df, column_name, p_threshold=0.05):
    """Classify observations into Local Moran's I spatial clusters.

    Builds Queen-contiguity weights, drops islands (observations with no
    neighbors, which Moran's I cannot handle), recomputes the weights, and
    runs Local Moran's I on `column_name`. Adds four boolean columns
    ('high_high', 'low_low', 'low_high', 'high_low') that are True where the
    observation falls in that quadrant AND its permutation p-value is below
    `p_threshold`. Returns the island-free copy of `df`.
    """
    w = Queen.from_dataframe(df, use_index=True)
    df_no_islands = df.drop(w.islands)
    w_no_islands = Queen.from_dataframe(df_no_islands, use_index=True)
    # Calculate Local Moran's I
    values = df_no_islands[column_name].values
    moran_loc = Moran_Local(values, w_no_islands)
    # Classify significant clusters.
    # esda encodes quadrants as 1=HH, 2=LH, 3=LL, 4=HL; the original mapping
    # swapped 2 and 3, mislabeling Low-High outliers as Low-Low coldspots
    # (and vice versa).
    cluster_labels = {
        1: 'high_high',  # High-High (hotspot)
        2: 'low_high',   # Low-High (low value among high neighbors)
        3: 'low_low',    # Low-Low (coldspot)
        4: 'high_low'}   # High-Low (high value among low neighbors)
    significant = moran_loc.p_sim < p_threshold
    for q, label in cluster_labels.items():
        df_no_islands[label] = significant & (moran_loc.q == q)
    return df_no_islands
# Calculate Moran's I for obesity
hotspots_moran = calculate_morans_i(df, '% Adults with Obesity')
# Define a mapping for colors and labels
# NOTE(review): the color values in this dict are written to a 'color' column
# that the plot below never reads — it colors by 'hotspot_label' with its own
# color_discrete_map, which uses yellow (not orange) for Diamonds.
color_label_mapping = {
    'high_high': ('red', 'Hotspot'),
    'low_low': ('blue', 'Coldspot'),
    'high_low': ('green', 'Doughnut'),
    'low_high': ('orange', 'Diamonds')}
# Apply color and label mapping
def assign_color_label(row):
    """Return the (color, label) pair of the first cluster flag set on this
    row, falling back to ('grey', 'None') when no flag is set."""
    flagged = (pair for flag, pair in color_label_mapping.items() if row[flag])
    return next(flagged, ('grey', 'None'))
hotspots_moran[['color', 'hotspot_label']] = hotspots_moran.apply(
    lambda row: pd.Series(assign_color_label(row)), axis=1)
# Create the choropleth map with the data generated above
# Fixed color per cluster label (grey for non-significant counties)
cluster_colors = {
    "Hotspot": "red",
    "Coldspot": "blue",
    "Diamonds": "yellow",
    "Doughnut": "green",
    "None": "grey",
}
fig = px.choropleth_mapbox(
    hotspots_moran,
    geojson=counties_json,
    locations=hotspots_moran.index,
    color='hotspot_label',
    color_discrete_map=cluster_colors,
    hover_data=['County', 'State', '% Adults with Obesity'],
    center={"lat": 39.5, "lon": -93.5},
    opacity=0.7,
    labels={'hotspot_label': 'Cluster Type'})
# Tight margins, continental-US zoom, fixed height
fig.update_layout(
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    mapbox=dict(zoom=3, style="open-street-map"),
    height=400)